#########################################################
## @file   : tbeautifulsoap.py
## @desc   : test tbeautifulsoap.py 
## @create : 2020/07/16
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################


from bs4 import BeautifulSoup
from collections import defaultdict
import re

# Load the saved page (presumably a Douban search-result dump -- confirm);
# bytes that cannot be decoded as UTF-8 are ignored instead of raising.
with open('./tdouban.data.t', mode='r', encoding='utf8', errors='ignore') as page_file:
    resp = page_file.read()

# Build the parse tree using the "html.parser" backend.
soup = BeautifulSoup(resp, "html.parser")

# TEST
#print(soup.head) # contents of the <head> tag
#print(soup.head.contents) # child nodes of the <head> tag
#print(soup.body.contents) # child nodes of the <body> tag
#print(len(soup.body.contents)) # number of direct child nodes of the <body> tag
#print(soup.body.contents[7]) #
#print(soup.div)
#tag = soup.a
#print(tag['class'])
#print(len(soup.contents))
#tag = soup.children
#tag = soup.contents
#tag = soup.descendants
#print(len(list(tag)))
"""
for t in tag:
    print('txxxxxxxxxxxxxxxxxxxxxxx')
    print(t)
"""

# Pulls the numeric subject id ("sid") out of the cover link's onclick
# handler, which looks like:
#onclick="moreurl(this,{i: '0', query: '24%E5%A0%82%E8%B4%A2%E5%AF%8C%E8%AF%BE', from: 'dou_search_book', sid: 3845101, qcat: ''})"
# The pattern is a raw string so that "\d" reaches the regex engine as
# written instead of being treated as an (invalid) string escape.
rec = re.compile(r'^.+sid: (\d+),.+$')

# Parsed book entries, keyed by "<sid>-<idx>"; missing keys start out as {}.
bkinfo = defaultdict(dict)
# Number of book entries stored so far; appended to the sid to build the key.
idx = 0

#<div class="result">
# Walk every search hit and store the book entries in bkinfo.
# Each stored entry has the shape:
#   bkinfo["<sid>-<idx>"] = {
#       "bookname": ..., "link": ..., "img": ..., "star": ..., "score": ...,
#       "ratingnum": ..., "author": ..., "publisher": ..., "publishing": ...,
#       "description": ...,
#   }
atag = soup.find_all("div", attrs={"class": "result"})
for t in atag:
    # sp is a bs4.element.Tag, or None when the result holds no <span>.
    sp = t.span

    # Only results whose first span mentions '书籍' are processed.
    # sp.string is None when bs4 cannot reduce the span to a single string
    # (e.g. it has several children), so check it before using `in`.
    if not sp or not sp.string or '书籍' not in sp.string:
        continue

    # Cover link, e.g.
    #<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F3845101%2F&amp;query=...&amp;cat_id=1001&amp;type=search&amp;pos=0" onclick="moreurl(this,{i: '0', query: '...', from: 'dou_search_book', sid: 3845101, qcat: ''})" target="_blank" title="24堂财富课"><img src="https://img9.doubanio.com/view/subject/s/public/s29357535.jpg"/></a>
    nbg = t.find('a', attrs={'class': 'nbg'})

    # Rating block, e.g.
    #   <div class="rating-info">
    #   <span class="allstar40"></span>
    #   <span class="rating_nums">8.3</span>
    #   <span>(218人评价)</span>
    #   <span class="subject-cast">陈志武 / 理想国 | 台海出版社 / 2017</span>
    #   </div>
    rating_info = t.find('div', attrs={'class': 'rating-info'})

    # find() yields None for a missing element; such a result cannot be
    # parsed, so skip it instead of failing on a None dereference.
    if nbg is None or rating_info is None:
        continue

    # Without a sid in the onclick handler there is no key to store under.
    onclick = rec.search(nbg.get('onclick', ''))
    if onclick is None:
        continue

    title = nbg['title']
    bookhref = nbg['href']
    imghref = nbg.img['src']

    # Serialise the non-blank children of the rating block back to HTML
    # text; the fields are then picked out of that text with regexes.
    rate_list = [s for s in (str(r).strip() for r in rating_info.children) if s]
    if len(rate_list) == 4:
        # star, score, rating count, cast
        star_html, score_html, count_html, cast_html = rate_list
        rating_score = re.search(r'.+rating_nums\">(.+)<\/', score_html).group(1)
    elif len(rate_list) == 3:
        # star, rating count, cast -- no score element, recorded as 0
        star_html, count_html, cast_html = rate_list
        rating_score = 0
    else:
        continue

    star = re.search(r'class=\"(.+)\">', star_html).group(1)
    rating_number = re.search(r'.+>\((.+)\)<', count_html).group(1)
    tt = re.search(r'.+subject-cast\">(.+)<\/', cast_html).group(1).split(' / ')

    if len(tt) >= 3:
        # The last two fields are publisher and publishing date; everything
        # before them is treated as the author list.
        *au, publisher, publishing = tt
        author = '/'.join(au)
    else:
        # Too few fields to tell publisher and date apart.  Reset them
        # explicitly so they are never undefined and never carry over the
        # values of the previously processed book.
        author = tt[0]
        publisher = ''
        publishing = ''

    # NOTE: this is the first <p> Tag of the result (or None), not its text.
    description = t.p

    # idx makes the key unique even if the same sid shows up more than once.
    sid = '-'.join([onclick.group(1), str(idx)])

    print('\n== sid', sid)
    print('== bookname', title)
    print('== link', bookhref)
    print('== img', imghref)
    print('== star', star)
    print('== score', rating_score)
    print('== ratingnum', rating_number)
    print('== author', author)
    print('== publisher', publisher)
    print('== publishing', publishing)
    print('== description ', description)

    bkinfo[sid].update({
        'bookname': title,
        'link': bookhref,
        'img': imghref,
        'star': star,
        'score': rating_score,
        # The number of ratings, not the score.
        'ratingnum': rating_number,
        'author': author,
        'publisher': publisher,
        'publishing': publishing,
        'description': description,
    })

    idx += 1
